1. Preliminaries: Load Libraries and Log into Twitter

library(rtweet)
library(httpuv)
library(tidyverse)
## Warning: package 'tidyr' was built under R version 4.0.5
library(tidytext)
library(wordcloud2)
library(qdapRegex)
library(tm)
library(webshot)
library(htmlwidgets)
library(ggplot2)
library(sf)
library(tmap)
library(knitr)

2. Pull Data from the Twitter API via rtweet Package

# Pull tweets with #CancelStudentDebt; returns 1000 most recent tweets; time by GMT
# NOTE(review): the standard search endpoint only indexes recent tweets
# (roughly the past week) — confirm against the rtweet::search_tweets() docs.
# include_rts=FALSE drops retweets; `-filter`="replies" excludes replies.
student_debt_tweets<-search_tweets(q="#CancelStudentDebt", 
                                   n=1000,
                                   include_rts=FALSE,
                                   `-filter`="replies",
                                   lang="en")
# Pull tweets containing both #CancelStudentDebt AND the word "capitalism"
# (terms separated by a space are AND-ed by the Twitter search API)
student_debt_capitalism_tweets <- search_tweets(
  q = "#CancelStudentDebt capitalism",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Pull tweets containing EITHER #CancelStudentDebt OR the word "capitalism"
# (the explicit OR keyword widens the query)
student_debt_OR_capitalism_tweets <- search_tweets(
  q = "#CancelStudentDebt OR capitalism",
  n = 1000,
  include_rts = FALSE,
  `-filter` = "replies",
  lang = "en"
)
# Pull tweets from an account timeline (not subject to the search API's recency window)
# Pull last 3200 BLM tweets (note sometimes the query will return fewer than 3200 due to deletions)
blm_tweets<-get_timeline("@Blklivesmatter", n=3200)

3. Clean, Organize, and Query Twitter Datasets

3.1. Query blm_tweets to find the 10 tweets with the most favorites

blm_tweets_most_favorited<-blm_tweets %>% slice_max(favorite_count, n=10)

3.2. Remove unnecessary columns from “blm_tweets_most_favorited”

# Trim to the columns of interest, then print the result
blm_tweets_most_favorited <- select(
  blm_tweets_most_favorited,
  created_at, screen_name, text, favorite_count
)
blm_tweets_most_favorited
## # A tibble: 10 × 4
##    created_at          screen_name    text                        favorite_count
##    <dttm>              <chr>          <chr>                                <dbl>
##  1 2020-08-29 03:31:41 Blklivesmatter "Beyond painful. Rest in P…          53597
##  2 2021-01-06 19:42:10 Blklivesmatter "So we all just gonna act …          46103
##  3 2020-06-15 15:34:41 Blklivesmatter "Until today, you could be…          44755
##  4 2020-05-31 03:21:51 Blklivesmatter "We call for an END to sys…          33327
##  5 2020-05-29 21:39:43 Blklivesmatter "Rest in Power, Beautiful.…          32572
##  6 2020-05-26 17:38:48 Blklivesmatter "His name was George Floyd…          31084
##  7 2020-06-09 21:24:28 Blklivesmatter "You have changed us forev…          29776
##  8 2020-06-05 14:04:00 Blklivesmatter "Happy 27th birthday, Breo…          26147
##  9 2020-10-11 00:22:13 Blklivesmatter "#BlackLivesMatter rises w…          23265
## 10 2020-06-03 11:59:10 Blklivesmatter "When people take to the s…          22675

3.3 Query blm_tweets to find the 10 tweets with the most retweets and then select desired columns in one block of code

# Trim to the columns of interest, then keep the 10 most-retweeted tweets
# (select/slice_max order is interchangeable here since retweet_count is kept)
blm_tweets_most_retweeted <- blm_tweets %>%
  select(created_at, screen_name, text, retweet_count) %>%
  slice_max(retweet_count, n = 10)
blm_tweets_most_retweeted
## # A tibble: 10 × 4
##    created_at          screen_name    text                         retweet_count
##    <dttm>              <chr>          <chr>                                <dbl>
##  1 2020-08-27 02:50:05 Blklivesmatter "FUCK THIS MAN!!!! WE DEMAN…        264125
##  2 2020-10-11 01:40:00 Blklivesmatter "A thread on what’s happeni…         51097
##  3 2020-05-03 17:42:05 Blklivesmatter "*Blinks in BLM*  https://t…         48906
##  4 2021-01-07 12:40:38 Blklivesmatter "They've killed us for less…         43303
##  5 2020-06-09 00:10:55 Blklivesmatter "3 million students attend …         41545
##  6 2020-07-18 16:50:58 Blklivesmatter "55 years ago today, we wer…         40229
##  7 2020-12-24 00:46:49 Blklivesmatter "Move, Mitch, get out the w…         39516
##  8 2020-05-03 17:42:58 Blklivesmatter "Think about how harshly #B…         39207
##  9 2020-06-14 16:39:14 Blklivesmatter "A heartbreaker. \n\nNext w…         28458
## 10 2021-01-18 18:38:11 Blklivesmatter "A thread of Dr. King in co…         28395

3.4 Remove retweets from blm_tweets

blm_tweets_noretweets<-blm_tweets %>% filter(is_retweet=="FALSE")

3.5 Query the data to find the 5 handles that have most frequently used #CancelStudentDebt

# Count tweets per handle, then keep the 5 largest counts
# (slice_max may return more than 5 rows when counts are tied)
student_debt_tweets_frequentweeters<-student_debt_tweets %>% 
                                      count(screen_name) %>% 
                                      slice_max(n, n=5)

3.7 Query the data to find the 10 hashtags appearing most frequently in conjunction with #CancelStudentDebt

# Unnest the list-column of hashtags, drop the search hashtag itself
# (case-insensitively), and keep the 10 most frequent co-occurring tags.
# The original mutate-a-lowercase-copy / filter / drop-the-copy dance is
# equivalent to filtering on str_to_lower() directly.
CancelStudentDebt_coinciding_hashtags<-student_debt_tweets %>% 
                                          select(hashtags) %>% 
                                          unnest(hashtags) %>%
                                          filter(str_to_lower(hashtags)!="cancelstudentdebt") %>% 
                                          count(hashtags) %>% 
                                          slice_max(n, n=10)

4. Visualize Data

4.1 Using ggplot to make visualizations of twitter data: bar graph of coincident hashtags

# Prepend "#" so the bar labels read as hashtags in the plot
CancelStudentDebt_coinciding_hashtags<-CancelStudentDebt_coinciding_hashtags %>% 
                                        mutate(hashtag=paste0("#", hashtags))

# Horizontal bar chart of the top coinciding hashtags, ordered by frequency.
# geom_col() is the idiomatic shorthand for geom_bar(stat = "identity"),
# and labs() consolidates xlab()/ylab()/ggtitle() into one call.
coincident_hashtags_plot <-
  ggplot(CancelStudentDebt_coinciding_hashtags, aes(x = reorder(hashtag, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(
    x = "",
    y = "Frequency",
    title = "Hashtags Most Frequently Used Along With #CancelStudentDebt",
    caption = "Data Collected from Twitter REST API via rtweet"
  )
coincident_hashtags_plot

4.2. Using rtweet’s visualization functions: time series example

# Plot tweet frequency binned by hour; ts_plot() returns a ggplot object,
# so ggplot2 labels and themes can be layered on with +
ts_plot(student_debt_tweets, "hours") +
  labs(x = NULL, y = NULL,
       title = "Frequency of tweets with a #CancelStudentDebt hashtag",
       # Subtitle shows the date range actually covered by the pulled tweets
       subtitle = paste0(format(min(student_debt_tweets$created_at), "%d %B %Y"), 
                         " to ",  
                         format(max(student_debt_tweets$created_at),"%d %B %Y")),
       caption = "Data collected from Twitter's REST API via rtweet") +
  theme_minimal()

4.3 Mapping Tweets

# Extract lat/longs from available geotag fields (lat_lng() adds lat/lng columns)
student_debt_tweets<-student_debt_tweets %>% lat_lng()

# remove records without geotags: use !is.na() directly rather than
# comparing the result of is.na() to FALSE
student_debt_tweets_latlong_extract<-student_debt_tweets %>% 
                                      filter(!is.na(lat) & !is.na(lng))

# Convert the geotagged tweets to an sf point layer; passing crs= to
# st_as_sf() assigns WGS84 (EPSG:4326) in the same call
student_debt_tweets_latlong_extract <- st_as_sf(
  student_debt_tweets_latlong_extract,
  coords = c("lng", "lat"),
  crs = "EPSG:4326"
)

# set tmap to view mode (interactive web map rather than a static plot)
tmap_mode("view")
## tmap mode set to interactive viewing
# make map: one dot per geotagged tweet
tm_shape(student_debt_tweets_latlong_extract)+
  tm_dots()

4.4. Make a Word Cloud of a Twitter Handle

First, prepare a word frequency table:

blm_text<-str_c(blm_tweets$text, collapse="")


# Clean the collapsed tweet text before building the term-document matrix.
blm_text <- 
  blm_text %>%
  str_remove_all("\\n") %>%               # remove ALL linebreaks (str_remove only removed the first)
  rm_twitter_url() %>%                    # Remove URLS
  rm_url() %>%
  str_remove_all("#\\S+") %>%             # Remove any hashtags
  str_remove_all("@\\S+") %>%             # Remove any @ mentions
  removeWords(stopwords("english")) %>%   # Remove common words (a, the, it etc.)
  removeNumbers() %>%
  stripWhitespace() %>%
  removeWords(c("amp", "the")) %>% 
  removePunctuation() %>% 
  # NOTE(review): this also strips "the" INSIDE words (e.g. "other" -> "or");
  # consider a word-boundary pattern like '\\b[Tt]he\\b' if that is unintended
  str_remove_all(pattern='[Tt]he') %>% 
  # NOTE(review): relies on ICU's [:emoji:] set syntax — verify it matches as intended
  str_remove_all(pattern='[:emoji:]')

# Build a term-document matrix from the cleaned text (one "document" here),
# then convert it to a plain matrix of term counts.
textCorpus <- 
  Corpus(VectorSource(blm_text)) %>%
  TermDocumentMatrix() %>%
  as.matrix()

# Row sums = total count of each term, sorted most-frequent first
textCorpus <- sort(rowSums(textCorpus), decreasing=TRUE)
# Convert the named vector into the two-column word/freq frame wordcloud2 expects
textCorpus <- data.frame(word = names(textCorpus), freq=textCorpus, row.names = NULL)
# Inspect the frequency table interactively (RStudio viewer)
View(textCorpus)
## word freq
## black 844
## police 403
## will 352
## people 351
## today 296
## now 280

Make a word cloud using the word frequency table

# Build the word cloud; rotation disabled and a flattened ellipse for legibility.
# Printing the htmlwidget renders it in the viewer.
wordcloud_blm <- wordcloud2(data = textCorpus, minRotation = 0, maxRotation = 0, ellipticity = 0.2)
wordcloud_blm

You can write out your word cloud to disk with the following:

# phantomjs is required by webshot to render the HTML widget to an image
install_phantomjs()
# Use TRUE/FALSE rather than the reassignable shorthand F
saveWidget(wordcloud_blm, "blm.html", selfcontained = FALSE)
# delay gives the widget time to finish rendering before the screenshot
webshot("blm.html", "blm.png", vwidth=1000, vheight=1000, delay=10)

5. Writing Functions and Automating Your Twitter Analysis

Let’s say you expect to build a lot of word clouds, and don’t want to keep copy-pasting your code. At that point, it makes sense to write a function that’ll automatically create a word cloud, based on the inputs (Twitter handle, and number of tweets to pull from the API, up to 3200) that you supply.

5a. Wrap the code to create a word cloud into a function

#' Build a word cloud from a Twitter account's recent timeline.
#'
#' @param twitterhandle Screen name of the account (e.g. "nytimes").
#' @param tweet_number Number of tweets to pull from the API (up to 3200).
#' @return A wordcloud2 htmlwidget.
twitter_wordcloud<-function(twitterhandle, tweet_number){
  tweet_timeline<-get_timeline(twitterhandle, n=tweet_number)
  # Separate tweets with a space so words at tweet boundaries do not fuse
  # (collapse="" produced run-together tokens in the frequency counts)
  tweet_timeline_text<-str_c(tweet_timeline$text, collapse=" ")

  tweet_timeline_text<-tweet_timeline_text %>%
    str_remove_all("\\n") %>%               # remove ALL linebreaks (str_remove only removed the first)
    rm_twitter_url() %>%                    # Remove URLS
    rm_url() %>%
    str_remove_all("#\\S+") %>%             # Remove any hashtags
    str_remove_all("@\\S+") %>%             # Remove any @ mentions
    removeWords(stopwords("english")) %>%   # Remove common words (a, the, it etc.)
    removeNumbers() %>%
    stripWhitespace() %>%
    removeWords(c("amp")) %>% 
    removePunctuation() %>% 
    str_remove_all(pattern='[Tt]he') %>% 
    str_remove_all(pattern='[:emoji:]')

  # Term-document matrix -> named vector of per-word counts
  textCorpus <- 
    Corpus(VectorSource(tweet_timeline_text)) %>%
    TermDocumentMatrix() %>%
    as.matrix()

  textCorpus <- sort(rowSums(textCorpus), decreasing=TRUE)
  textCorpus <- data.frame(word = names(textCorpus), freq=textCorpus, row.names = NULL)

  # No rotation; flattened ellipse for readability
  wordcloud <- wordcloud2(data = textCorpus, minRotation = 0, maxRotation = 0, ellipticity = 0.2)
  wordcloud
}

5b. Test the function

# Generate word cloud for past 400 NYT twitter posts, assign to object, and view word cloud
nyt_wordcloud<-twitter_wordcloud("nytimes", 400)
# View NYT wordcloud (printing the htmlwidget renders it in the viewer)
nyt_wordcloud

5c. Iteratively Apply the Function to Multiple Twitter Handles

Apply the “twitter_wordcloud” function created above to multiple handles, and generate multiple word clouds based on those handles. We’ll apply the function to the Twitter handles of the New York Times, Financial Times, Washington Post, Fox News, CNN, and the Denver Post.

handles<-c("nytimes", "FinancialTimes", "FoxNews", "cnn", "washingtonpost", "denverpost")
# A length-1 .y is recycled across every handle by map2()
number<-c(400)
# Name each element of handles by its own value (set_names with one argument);
# map2() carries the names of .x into the output list, so the clouds can be
# retrieved later with wordcloud_list[["washingtonpost"]] etc. — with an
# unnamed vector those lookups would silently return NULL, and
# names(wordcloud_list) (used below to write files) would be NULL.
wordcloud_list<-map2(.x=set_names(handles), .y=number, twitter_wordcloud)

View the Washington Post word cloud by accessing it from the list:

# View Washington Post Word Cloud
# NOTE(review): this lookup only works if the list elements are named by
# handle — verify names(wordcloud_list) is not NULL where the list is built
wordcloud_list[["washingtonpost"]]

View the Denver Post word cloud by accessing from the list:

# View Denver Post word cloud by extracting it from the list
# (requires the list elements to be named by handle)
wordcloud_list[["denverpost"]]

What would you type if you want to extract the Financial Times word cloud from the list?

5d. Iteratively Write Out All of the Media Word Clouds to Disk

# Write function that takes a word cloud, and a word cloud name, and writes the word cloud out to disk
#' Write a single word cloud widget out to disk as HTML and PNG.
#'
#' @param wordclouds_to_export A wordcloud2 htmlwidget.
#' @param wordcloud_names Base file name (no extension) for the output files.
output_wordclouds<-function(wordclouds_to_export, wordcloud_names){
  # Restore the caller's working directory on exit instead of leaving it
  # permanently changed as a side effect (setwd returns the previous wd)
  old_wd <- setwd("/Users/adra7980/Documents/git_repositories/twitter_workshop/wordclouds")
  on.exit(setwd(old_wd), add = TRUE)
  # NOTE(review): install_phantomjs() runs on every call; it should be a
  # no-op once installed — confirm, or hoist it out of the function
  install_phantomjs()
  saveWidget(wordclouds_to_export, paste0(wordcloud_names, ".html"), selfcontained=FALSE)
  webshot(paste0(wordcloud_names, ".html"), paste0(wordcloud_names, ".png"), vwidth=1992, vheight=1744, delay=10)
}
# iteratively apply previous function across word clouds in list and write all to disk
# NOTE(review): requires wordcloud_list to carry names — map2() will error
# if names(wordcloud_list) is NULL; verify where the list is created
map2(.x=wordcloud_list, .y=names(wordcloud_list), .f=output_wordclouds)